{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget --no-clobber -O ../leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install stringzilla memory_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext memory_profiler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import stringzilla as sz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "pythonic_str: str = open(\"../leipzig1M.txt\", \"r\").read()\n",
    "sz_str = sz.Str(pythonic_str)\n",
    "pattern = \"the\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[sz.Str('A'),\n",
       " sz.Str('rebel'),\n",
       " sz.Str('statement'),\n",
       " sz.Str('sent'),\n",
       " sz.Str('to')]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import itertools\n",
    "top5 = itertools.islice(sz_str.split_iter(\" \"), 5) # grab the first five words\n",
    "top5 = list(top5)\n",
    "top5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "129,644,797 characters taking 129,644,797 bytes\n"
     ]
    }
   ],
   "source": [
    "print(f\"{len(pythonic_str):,} characters taking {len(sz_str):,} bytes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Both libraries report the same number of lines: 1,000,000\n"
     ]
    }
   ],
   "source": [
    "python_lines_count = pythonic_str.count(\"\\n\")\n",
    "sz_lines_count = sz_str.count(\"\\n\")\n",
    "assert python_lines_count == sz_lines_count\n",
    "print(f\"Both libraries report the same number of lines: {python_lines_count:,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total of 20,191,473 words of average length ~6.42 characters bytes or ~6.42 bytes\n"
     ]
    }
   ],
   "source": [
    "count_words = pythonic_str.count(\" \")\n",
    "print(f\"Total of {count_words:,} words of average length ~{len(pythonic_str) / count_words:.2f} characters bytes or ~{len(sz_str) / count_words:.2f} bytes\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.33 s ± 4.52 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "651 ms ± 2.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "465 ms ± 2.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%timeit sum(1 for _ in pythonic_str.split())\n",
    "%timeit sum(1 for _ in sz_str.split())\n",
    "%timeit sum(1 for _ in sz_str.split_iter())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "peak memory: 2235.44 MiB, increment: 1422.00 MiB\n",
      "peak memory: 890.62 MiB, increment: 77.00 MiB\n",
      "peak memory: 890.62 MiB, increment: 0.00 MiB\n"
     ]
    }
   ],
   "source": [
    "%memit sum(1 for _ in pythonic_str.split())\n",
    "%memit sum(1 for _ in sz_str.split())\n",
    "%memit sum(1 for _ in sz_str.split_iter())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "143 ms ± 7.21 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 10\n",
    "random.choices(pythonic_str.splitlines(), k=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "25.1 ms ± 481 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 10\n",
    "sz_str.splitlines().sample(1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Throughput"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "506 ms ± 12.5 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 10\n",
    "sorted(pythonic_str.splitlines())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "382 ms ± 11.1 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 10\n",
    "sz_str.splitlines().sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "144 ms ± 4 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 100\n",
    "pythonic_str.count(pattern)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "31.2 ms ± 1.06 ms per loop (mean ± std. dev. of 100 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 100\n",
    "sz_str.count(pattern)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Latency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28.9 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "hash(pythonic_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "365 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "hash(sz_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.34 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "pythonic_str.find(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.37 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "sz_str.find(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "57.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "pythonic_str.partition(\" \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.65 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "sz_str.partition(\" \")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "pythonic_str.split(\" \").sort()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.86 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit -n 1 -r 1\n",
    "sz_str.split(\" \").sort()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
